# import python libraries
import numpy as np, pandas as pd, geopandas as gpd, matplotlib.pyplot as plt
import geopy.distance, folium
# WWTPs (wastewater treatment plants) data
wwtps = pd.read_excel('WWTPs.xlsx', 'datasheet')
wwtps = gpd.GeoDataFrame(wwtps, crs='EPSG:4269',
geometry=gpd.points_from_xy(x=wwtps.Longitude,
y=wwtps.Latitude))
# oil refineries data
refinery = pd.read_excel('refineries.xlsx')
refinery = gpd.GeoDataFrame(refinery, crs='EPSG:4269',
geometry=gpd.points_from_xy(x=refinery.Longitude,
y=refinery.Latitude))
# US data
US = gpd.read_file('US_data/cb_2018_us_state_500k.shp')
US = US[['NAME', 'geometry']]
# just include the conterminous US
for excluded in ('Alaska',
'Hawaii',
'Puerto Rico',
'American Samoa',
'Commonwealth of the Northern Mariana Islands',
'Guam',
'United States Virgin Islands'):
US = US.loc[US['NAME'] != excluded]
# change crs to 3857 (unit: m)
wwtps = wwtps.to_crs(crs='EPSG:3857')
refinery = refinery.to_crs(crs='EPSG:3857')
US = US.to_crs(crs='EPSG:3857')
# spatial join of data to select out WWTPs and refineries that are in the conterminous US
wwtps = gpd.sjoin(wwtps, US)
wwtps = wwtps.drop(['index_right'], axis=1)
refinery = gpd.sjoin(refinery, US)
refinery = refinery.drop(['index_right'], axis=1)
# basic visualization
fig, ax = plt.subplots(figsize=(15,15))
ax.tick_params(top=False, bottom=False, left=False, right=False,
labelleft=False, labelbottom=False) # remove ticks
ax.set_frame_on(False) # remove frame
US.plot(ax=ax, color='w', edgecolor='k')
wwtps.plot(ax=ax, color='gray', alpha=0.3, markersize=5) # WWTPs shown as small gray dots
refinery.plot(ax=ax, color='lightgreen', markersize=100, edgecolor='k', linewidth=1)
# refineries shown as large green dots
# select out WWTPs that are within 100 km of refineries, one WWTP is only assigned once to its nearest refinery
# Since crs 3857 is not accurate at all in terms of distance calculation, especially more far away from the equator,
# I used a larger distance limits (150 km) as the max_distance in sjoin_nearest
# and then used geopy.distance.geodesic to recalculate the distance and filter out those that are actually longer than 100 km
wwtps_within = wwtps.sjoin_nearest(refinery, max_distance=150000, distance_col='distance')
wwtps_within['WWTP_location'] = list(zip(wwtps_within.Latitude_left, wwtps_within.Longitude_left))
wwtps_within['refinery_location'] = list(zip(wwtps_within.Latitude_right, wwtps_within.Longitude_right))
linear_distance = []
for i in range(len(wwtps_within)):
linear_distance.append(geopy.distance.geodesic(wwtps_within['WWTP_location'].iloc[i], wwtps_within['refinery_location'].iloc[i]).km)
wwtps_within['linear_distance'] = linear_distance
wwtps_within = wwtps_within[wwtps_within['linear_distance'] <= 100] # select out WWTPs that are within 100 km of refineries
wwtps_within.head()
# for every refinery: calculate the capacity of WWTPs (in MGD, million gallon per day) in its neighboring region,
# with the buffering distance increasing 1 km each step from 0 to 100 km
result = wwtps_within[['site_id']].drop_duplicates() # get the site_id of all refineries and remove duplicates
for distance in np.linspace(0, 100, 101):
wwtps_within_distance = wwtps_within[wwtps_within['linear_distance']<=distance] # select out WWTPs that are within certain distance
wwtps_within_distance = wwtps_within_distance.groupby('site_id').sum('Influent Flow (MMGal/d)') # sum WWTPs capacity based on their assigned refineries
wwtps_within_distance = wwtps_within_distance[['Influent Flow (MMGal/d)']]
wwtps_within_distance = wwtps_within_distance.rename(columns={'Influent Flow (MMGal/d)': int(distance)}) # use the corresponding distance as column name
wwtps_within_distance.reset_index(inplace=True)
if len(wwtps_within_distance) > 0:
result = result.merge(wwtps_within_distance, how='left', on='site_id') # merge the result at one distance to the final result
result = result.fillna(0) # fill NA/NAN with 0
# some refineries don't have WWTPs assigned to them, so the length of result can be smaller than that of refinery
# for those refineries, the values from 0 km to 100 km should be all 0
if len(result) < len(refinery):
result_index = pd.Index(result.site_id)
refinery_index = pd.Index(refinery.site_id)
refineries_left_id = refinery_index.difference(result_index).values
result = result.set_index('site_id')
for item in refineries_left_id:
result.loc[item] = [0]*len(result.columns)
result.head()
# add the refinery informaton to result
result = result.merge(refinery, how='left', on='site_id')
# group the result based on the location of refineries
PADD_1 = result[result['PADD']==1] # East Coast
PADD_2 = result[result['PADD']==2] # Midwest
PADD_3 = result[result['PADD']==3] # Gulf Coast
PADD_4 = result[result['PADD']==4] # Rocky Mountain
PADD_5 = result[result['PADD']==5] # West Coast
# for every single refinery, plot its cumulative WWTPs capacity as a function of buffering distance
# there will be five plots for five PADD regions and each line represents one refinery
fig, axes = plt.subplots(1, 5, figsize=(20,8))
order = -1
colors = [[0.376,0.757,0.812], # blue
[0.475,0.749,0.510], # green
[0.929,0.345,0.435], # red
[0.976,0.561,0.376], # orange
[0.953,0.765,0.329]] # yellow
titles = ['East Coast', 'Midwest', 'Gulf Coast', 'Rocky Mountain', 'West Coast']
for region in (PADD_1, PADD_2, PADD_3, PADD_4, PADD_5):
order += 1
for i in range(len(region)):
region.iloc[i,1:100].plot(ax=axes[order], c=colors[order])
axes[order].set_xlim(0,100)
axes[order].set_ylim(0,3000)
axes[order].tick_params(axis='x', labelsize=12)
axes[order].tick_params(axis='y', labelsize=12)
if order == 0:
axes[order].set_ylabel('Cumulative WWTPs capacity [MGD]',fontsize=20)
if order == 2:
axes[order].set_xlabel('Distance [km]',fontsize=20)
axes[order].set_title(titles[order], fontdict={'color':colors[order],'size':20})
# select out top 10 refineries that have largest cumulative WWTPs capacities within 100 km
site_id = result[['site_id',100]].sort_values(100, ascending=False).head(10).site_id
# make a visualization again, but this time, only refineries with the top 10 highest capacities will be shown
fig, ax = plt.subplots(figsize=(15,15))
ax.tick_params(top=False, bottom=False, left=False, right=False,
labelleft=False, labelbottom=False)
ax.set_frame_on(False)
US.plot(ax=ax, color='w', edgecolor='k')
wwtps.plot(ax=ax, color='gray', alpha=0.3, markersize=5) # WWTPs shown as small gray dots
top_10_refinery = refinery.loc[refinery['site_id'].isin(site_id)]
top_10_refinery.plot(ax=ax, marker='*', color=colors[2], markersize=500, edgecolor='k', linewidth=1) # shown as red star
# get the 100 km cumulative WWTPs capacity of NO.10 refinery
top_10_min = result.sort_values(100, ascending=False).head(10).iloc[-1][100]
# make a folium map that show top 10 refineries in red and others in blue
m = folium.Map(location=[39, -96],zoom_start=4)
for i in range(len(result)):
color = 'red' if result.iloc[i][100] >= top_10_min else 'blue'
folium.Marker([result.iloc[i]['Latitude'],result.iloc[i]['Longitude']],
popup=folium.Popup(f'''
Company: {result.iloc[i]["Company"]}<br>
Corp: {result.iloc[i]["Corp"]}<br>
State: {result.iloc[i]["State"]}<br>
PADD: {result.iloc[i]["PADD"]}<br>
WWTPs within 100 km: {result.iloc[i][100]:.2f} MGD
''',
max_width=500),
icon = folium.Icon(color = color, icon='industry', prefix='fa'), # use 'industry' icon
tooltip='click for more information').add_to(m)
m.save('US_refinery.html') # save the folium map
m